This work has been published in Cancer Research Communications journal in April 2023. Available: https://doi.org/10.1158/2767-9764.CRC-22-0385
The original dataset used in this project was provided by BC Cancer Gastrointestinal Cancer Outcomes Unit (GICOU) and BC Vital Statistics. It contains a total of 20136 colorectal cancer cases received radiotherapy in BC between 2002 and 2016. All personal identifiers (agency ID, PHN, and DOB, etc.) were removed before saving the data into a csv file.
library(knitr)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, dpi = 100)
library(kableExtra)
library(tidyverse)
options(dplyr.summarise.inform = FALSE)
library(xlsx)
library(binom)
library(ggrepel)
library(ggalluvial)
library(survival)
library(survminer)
data_raw <- read_csv("EOcrc_final_GI_RT_prot_w_Chemo_RTvars_20211123_personal_info_removed.csv",
show_col_types = FALSE)
cT, cN, pT,
pN into numbers.
neoadjuvant as flag for any further analysis.NA.data_cleaned <- data_raw %>%
select(diagnosis_date, site_admit_date, age_at_diagnosis, loc_at_diag, sex,
performance_status, site, behavior, hist1, final_grade, tumour_subgroup,
cT, cN, overall_clin_stg, pT, pN, m_stg, overall_path_stg, overall_stg,
neoadjuvant, M1_at_Dx, COL_tum_size, final_tot_nodes, final_pos_nodes,
final_MSI, cr_anal_verge_dist_onco, cr_anal_verge_dist_status_onco,
final_CEA, cr_prim_site_p00020, final_margin_status, GI_RT_protcode1,
cr_rt_prim_onco, cr_rt_prim_p00020, cr_system_tx_dt_onco,
cr_system_tx_dt_fst_p00020, cr_system_tx_dt_stat_onco, cr_system_tx_onco,
cr_system_tx_p00020, cr_system_tx_type_fst_p00020, surg_treat_date1,
surg_treat_date2, surg_treat_date3, surg_treat_date4, RT_start_date1,
RT_end_date1, RT_start_date2, RT_end_date2, RT_start_date3, RT_end_date3,
RT_start_date4, RT_end_date4, RT_start_date5, RT_end_date5, RT_start_date6,
RT_end_date6, m1dtofdx, m1site, fst_relapse_date, fst_reg_dist_date,
LOCIND, LOCDATE, LOCSITE, REGIND, REGDATE, REGSITE, DISTIND, DISTDATE, DISTSITE,
subseq_colorectalca, subseq_CRC_dx_date, pat_status, death_date,
FINAL_last_contact_date, survyrs, colorectaldeath, colorectaldthdat,
loctodth, regtodth, distant_date, anyreldt, anyenddt, anysurv, anystat,
locregdate, locsurv, locstat,locenddt, min_mets_date,
survyrs_from_dist_mets, relapse_to_lcd, dx_to_final_lcd) %>%
mutate(
## Assign age groups
age_group1 = cut(age_at_diagnosis, breaks = c(0, 49, Inf),
labels = c("<50", ">=50")) %>%
factor(),
age_group2 = cut(age_at_diagnosis, breaks = c(0, 29, 39, 49, Inf),
labels = c("<=29", "30-39", "40-49", ">=50")) %>%
factor(),
## remove unknown status (mostly number 8 or 9)
across(c(performance_status, final_grade, overall_path_stg,
overall_clin_stg, final_margin_status, colorectaldeath),
~ifelse(.x <= 4, .x, NA)),
final_MSI = ifelse(final_MSI %in% c(7, 8), NA, final_MSI),
## convert T and N stage info into numbers
across(c(cT, cN, pT, pN), parse_number),
## remove unknown node info (numbers > 90)
across(c(final_tot_nodes, final_pos_nodes), ~ifelse(.x > 90, NA, .x)),
## remove unknown CEA info (numbers > 988), convert 980 to 98 (>98 category)
final_CEA = ifelse(final_CEA > 988, NA, ifelse(final_CEA == 980, 98, final_CEA)),
## remove unknown margin status
final_margin_status = ifelse(final_margin_status > 2, NA, final_margin_status),
## convert into date format
across(c(diagnosis_date, site_admit_date, FINAL_last_contact_date,
colorectaldthdat, distant_date, anyreldt, anyenddt,
locregdate, locenddt, min_mets_date),
~as.Date(.x, "%d-%b-%y")),
across(c(cr_system_tx_dt_onco, surg_treat_date1, surg_treat_date2,
surg_treat_date3, surg_treat_date4, RT_start_date1, RT_end_date1,
RT_start_date2, RT_end_date2, RT_start_date3, RT_end_date3,
RT_start_date4, RT_end_date4, RT_start_date5, RT_end_date5,
RT_start_date6, RT_end_date6, m1dtofdx, fst_relapse_date,
fst_reg_dist_date, LOCDATE, REGDATE, DISTDATE, subseq_CRC_dx_date,
death_date),
~as.Date(.x, "%m/%d/%Y")))
Any data entry errors found throughout the entire analysis were corrected in this step. (Codes are hidden here.)
site == "C209"
(site_desc == "Rectum, NOS") to select all rectal cancer
cases.tumor_subgroup == "RE" as it contains
non-rectal cancer cases.hist1 == 80453
(hist1_desc == "Combined small cell carcinoma") and 9 cases
of hist1 == 81482
(hist1_desc == "Glandular intraepithelial heoplasia, grade III).cr_prim_site_p00020 == "B"
(cr_prim_site_p00020 == "Rectosigmoid Junction").cr_prim_site_p00020 and
cr_anal_verge_dist_onco.
dist_merged and
loc_merged.dist_merged, e.g. "Rectum, upper (11-15 cm)" →
13, "Rectum, mid (5-10.9 cm)" →
8.loc_merged.cr_rt_prim_onco and
cr_rt_prim_p00020.
rt_merged columnrt_merged values 1 (pre-op short) and 2 (pre-op
long), categorize into long course group and short course grouprt_merged values 5, 77, 88 whether any cases
qualify for pre-op RT (compare surg_treat_date1 and
RT_start_date1) → result: none qualifieddata_filtered <- data_cleaned %>%
## select all rectal cancer cases
filter(site == "C209", hist1 != 80453, hist1 != 81482,
(cr_prim_site_p00020 != "B") |
is.na(cr_prim_site_p00020),
(cr_anal_verge_dist_onco <= 15) |
(cr_anal_verge_dist_onco %in% c(89, 99)) |
is.na(cr_anal_verge_dist_onco)) %>%
## merge distance/location info for further processing
unite("dist_merged", c(cr_prim_site_p00020, cr_anal_verge_dist_onco),
na.rm = TRUE, remove = FALSE) %>%
## merge RT info for further processing
unite("rt_merged", c(cr_rt_prim_onco, cr_rt_prim_p00020),
na.rm = TRUE, remove = FALSE) %>%
mutate(
## relabel sex
sex = factor(sex, levels = c("F", "M"), labels = c("Female", "Male")),
## determine rural/urban
rural_urban = ifelse(is.na(loc_at_diag), "Unknown",
ifelse(grepl(".0....", loc_at_diag), "Rural", "Urban")) %>%
factor(levels = c("Rural", "Urban", "Unknown")),
## convert letter into avg distance
dist_merged = case_when(dist_merged == "C" ~ 13, # rectum, upper (11-15 cm)
dist_merged == "D" ~ 8, # rectum, mid (5-10.9 cm)
dist_merged == "E" ~ 2.5, # rectum, distal (<5 cm)
TRUE ~ as.numeric(dist_merged)),
## convert distance to letter
loc_merged = case_when(cr_prim_site_p00020 == "C" ~ "Proximal (11-15 cm)",
cr_prim_site_p00020 == "D" ~ "Mid (5-10.9 cm)",
cr_prim_site_p00020 == "E" ~ "Distal (<5 cm)",
# cr_prim_site_p00020 == "F" ~ "NOS",
is.na(cr_prim_site_p00020) &
((cr_anal_verge_dist_onco >= 11) &
(cr_anal_verge_dist_onco <= 15)) ~
"Proximal (11-15 cm)",
is.na(cr_prim_site_p00020) &
((cr_anal_verge_dist_onco >= 5) &
(cr_anal_verge_dist_onco < 11)) ~
"Mid (5-10.9 cm)",
is.na(cr_prim_site_p00020) &
(cr_anal_verge_dist_onco < 5) ~
"Distal (<5 cm)") %>%
# TRUE ~ "NOS") %>%
factor(levels = c("Distal (<5 cm)", "Mid (5-10.9 cm)",
"Proximal (11-15 cm)")),
## pCR status
pCR = case_when(((pT == 0) & (pN == 0)) | (overall_path_stg == 0) ~ "Yes",
!is.na(pT) & !is.na(pN) ~ "No") %>%
factor(levels = c("No", "Yes")),
pT = case_when(pCR == "Yes" ~ 0,
TRUE ~ pT),
pN = case_when(pCR == "Yes" ~ 0,
TRUE ~ pN),
overall_path_stg = case_when(pCR == "Yes" ~ 0,
TRUE ~ overall_path_stg),
## calculate changes in cancer stages
T_stg_change = pT - cT,
T_stg_change_status = case_when(T_stg_change < 0 ~ "Downstage",
T_stg_change == 0 ~ "No change",
T_stg_change > 0 ~ "Upstage") %>%
factor(),
N_stg_change = pN - cN,
N_stg_change_status = case_when(N_stg_change < 0 ~ "Downstage",
N_stg_change == 0 ~ "No change",
N_stg_change > 0 ~ "Upstage") %>%
factor(),
overall_stg_change = overall_path_stg - overall_clin_stg,
overall_stg_change_status = case_when(overall_stg_change < 0 ~ "Downstage",
overall_stg_change == 0 ~ "No change",
overall_stg_change > 0 ~ "Upstage") %>%
factor(),
## pCR status
pCR = case_when((pT == 0) & (pN == 0) ~ "Yes",
!is.na(pT) & !is.na(pN) ~ "No") %>%
factor(levels = c("No", "Yes")),
## calculate RT durations
RT_duration1 = RT_end_date1 - RT_start_date1,
RT_duration2 = RT_end_date2 - RT_start_date2,
RT_duration3 = RT_end_date3 - RT_start_date3,
RT_duration4 = RT_end_date4 - RT_start_date4,
RT_duration5 = RT_end_date5 - RT_start_date5,
RT_duration6 = RT_end_date6 - RT_start_date6,
## identify pre-op RT and categorize into short and long course groups
rt_merged = as.integer(rt_merged),
rt_course = case_when(rt_merged == 1 ~ "Short-course RT",
(rt_merged == 2) & (is.na(GI_RT_protcode1)) ~ "Long-course RT",
(rt_merged == 2) & (!is.na(GI_RT_protcode1)) ~ "Long-course chemoRT") %>%
factor(levels = c("Short-course RT", "Long-course RT", "Long-course chemoRT")),
## categorize tumor grade
grade = case_when(final_grade %in% c(1, 2) ~ "Well-Differentiated (Grades 1 and 2)",
final_grade %in% c(3, 4) ~ "Poorly Differentiated (Grades 3 and 4)",
final_grade == 0 ~ "Grade 0") %>%
factor(levels = c("Grade 0", "Well-Differentiated (Grades 1 and 2)",
"Poorly Differentiated (Grades 3 and 4)")),
## categorize cancer types
histology = case_when(hist1 %in% c(80103, 80203, 80463, 81402, 81403, 81443,
82102, 82103, 82113, 82203, 82213, 82553,
82612, 82613, 82632, 82633) ~ "Adenocarcinoma",
hist1 %in% c(84803, 84813, 84903) ~ "Mucinous/Signet",
TRUE ~ "Other") %>%
factor(),
## categorize MSI
MSI_group = case_when(final_MSI %in% c(1, 2) ~ "MSI stable",
final_MSI %in% c(3, 9) ~ "MSI unstable, NOS; positive, NOS") %>%
factor(),
## categorize margin status
margin = case_when(final_margin_status == 0 ~ "Microscopic Margins Clear",
final_margin_status == 1 ~ "Microscopic Margins positive (<=1mm)",
final_margin_status == 2 ~ "Macroscopic Residual Cancer, grossly positive") %>%
factor(levels = c("Microscopic Margins Clear", "Microscopic Margins positive (<=1mm)",
"Macroscopic Residual Cancer, grossly positive")),
## categorize node status
node_total = case_when(final_tot_nodes >= 12 ~ ">=12",
final_tot_nodes < 12 ~ "<12") %>%
factor(),
node_positive = case_when(final_pos_nodes == 0 ~ "0",
final_pos_nodes %in% c(1, 2) ~ "1-2",
final_pos_nodes %in% c(3, 4) ~ "3-4",
(final_pos_nodes >= 5) & (final_pos_nodes <= 9) ~ "5-9",
final_pos_nodes >= 10 ~ "10+") %>%
factor(levels = c("0", "1-2", "3-4", "5-9", "10+")),
## overall survival status
surv_status = case_when(pat_status == "A" ~ 0,
pat_status == "D" ~ 1),
## disease specific status
crc_death_status = case_when(colorectaldeath == 1 ~ 1,
TRUE ~ 0),
## disease free status
recurr_status = case_when(LOCIND | REGIND | DISTIND |
subseq_colorectalca | surv_status ~ 1,
TRUE ~ 0),
dis_free_yrs = pmin(as.numeric((pmin(LOCDATE, REGDATE, DISTDATE, subseq_CRC_dx_date, death_date,
na.rm = TRUE)-diagnosis_date)/365.25),
survyrs, na.rm = TRUE),
## disease free status with non CRC censored
crc_recurr_status = case_when(LOCIND | REGIND | DISTIND |
subseq_colorectalca | crc_death_status ~ 1,
TRUE ~ 0),
crc_recurr_free_yrs = pmin(as.numeric((pmin(LOCDATE, REGDATE, DISTDATE, subseq_CRC_dx_date, colorectaldthdat,
na.rm = TRUE)-diagnosis_date)/365.25),
survyrs, na.rm = TRUE)
)
summary_general <- function(df, group_var) {
var <- deparse(substitute(group_var))
Reduce(function(x, y) merge(x, y, by = var, all = TRUE),
list(df %>%
filter(M1_at_Dx == 0) %>%
group_by(age_group2, {{group_var}}) %>%
summarize(sub_total = n()) %>%
pivot_wider(names_from = age_group2, values_from = sub_total) %>%
mutate_at(vars(-var), ~replace(., is.na(.), 0)) %>%
select(-`>=50`),
df %>%
filter(M1_at_Dx == 0) %>%
group_by(age_group1, {{group_var}}) %>%
summarize(sub_total = n()) %>%
pivot_wider(names_from = age_group1, values_from = sub_total) %>%
mutate_at(vars(-var), ~replace(., is.na(.), 0)),
df %>%
filter(M1_at_Dx == 0) %>%
group_by({{group_var}}) %>%
summarize(`Grand Total` = n()) %>%
mutate_at(vars(-var), ~replace(., is.na(.), 0)))
) %>%
arrange({{group_var}}) %>%
mutate({{group_var}} := as.character({{group_var}})) %>%
filter(!is.na({{group_var}})) %>%
bind_rows(summarize(., across(where(is.numeric), sum),
across(where(is.character), ~"Subtotal"))) %>%
rename(Parameter = {{group_var}}) %>%
# add percentage
mutate_at(vars(-Parameter),
~if_else((Parameter != "Subtotal") & (.x != 0),
paste0(.x, ' (', sprintf("%.1f", round(.x/.x[Parameter == "Subtotal"]*100, 1)), ')'),
# paste0(.x, ' (', sprintf("%.1f", round(.x/sum(.x)*100, 1)), ')'),
as.character(.x)))
}
| Baseline Characteristics | <=29 | 30-39 | 40-49 | <50 | >=50 | Grand Total |
|---|---|---|---|---|---|---|
| Sex | ||||||
| Female | 11 (57.9) | 47 (44.3) | 158 (38.8) | 216 (40.6) | 2002 (35.1) | 2218 (35.6) |
| Male | 8 (42.1) | 59 (55.7) | 249 (61.2) | 316 (59.4) | 3698 (64.9) | 4014 (64.4) |
| Subtotal | 19 | 106 | 407 | 532 | 5700 | 6232 |
| Geographic Location | ||||||
| Rural | 2 (10.5) | 11 (10.4) | 53 (13.0) | 66 (12.4) | 838 (14.7) | 904 (14.5) |
| Urban | 17 (89.5) | 95 (89.6) | 352 (86.5) | 464 (87.2) | 4859 (85.2) | 5323 (85.4) |
| Unknown | 0 | 0 | 2 (0.5) | 2 (0.4) | 3 (0.1) | 5 (0.1) |
| Subtotal | 19 | 106 | 407 | 532 | 5700 | 6232 |
| Tumor Location | ||||||
| Distal (<5 cm) | 2 (22.2) | 25 (30.9) | 95 (29.9) | 122 (29.9) | 1209 (28.4) | 1331 (28.6) |
| Mid (5-10.9 cm) | 5 (55.6) | 43 (53.1) | 165 (51.9) | 213 (52.2) | 2339 (55.0) | 2552 (54.8) |
| Proximal (11-15 cm) | 2 (22.2) | 13 (16.0) | 58 (18.2) | 73 (17.9) | 703 (16.5) | 776 (16.7) |
| Subtotal | 9 | 81 | 318 | 408 | 4251 | 4659 |
| Tumor Grade | ||||||
| Well-Differentiated (Grades 1 and 2) | 14 (77.8) | 81 (87.1) | 311 (87.1) | 406 (86.8) | 4456 (88.9) | 4862 (88.7) |
| Poorly Differentiated (Grades 3 and 4) | 4 (22.2) | 12 (12.9) | 46 (12.9) | 62 (13.2) | 555 (11.1) | 617 (11.3) |
| Subtotal | 18 | 93 | 357 | 468 | 5011 | 5479 |
| Histology | ||||||
| Adenocarcinoma | 18 (94.7) | 98 (92.5) | 396 (97.3) | 512 (96.2) | 5475 (96.1) | 5987 (96.1) |
| Mucinous/Signet | 1 (5.3) | 7 (6.6) | 10 (2.5) | 18 (3.4) | 207 (3.6) | 225 (3.6) |
| Other | 0 | 1 (0.9) | 1 (0.2) | 2 (0.4) | 18 (0.3) | 20 (0.3) |
| Subtotal | 19 | 106 | 407 | 532 | 5700 | 6232 |
| MSI | ||||||
| MSI stable | 4 (66.7) | 24 (85.7) | 77 (76.2) | 105 (77.8) | 369 (56.1) | 474 (59.8) |
| MSI unstable, NOS; positive, NOS | 2 (33.3) | 4 (14.3) | 24 (23.8) | 30 (22.2) | 289 (43.9) | 319 (40.2) |
| Subtotal | 6 | 28 | 101 | 135 | 658 | 793 |
| Margin | ||||||
| Microscopic Margins Clear | 9 (100.0) | 72 (86.7) | 282 (84.4) | 363 (85.2) | 3891 (88.6) | 4254 (88.3) |
| Microscopic Margins positive (<=1mm) | 0 | 10 (12.0) | 50 (15.0) | 60 (14.1) | 471 (10.7) | 531 (11.0) |
| Macroscopic Residual Cancer, grossly positive | 0 | 1 (1.2) | 2 (0.6) | 3 (0.7) | 32 (0.7) | 35 (0.7) |
| Subtotal | 9 | 83 | 334 | 426 | 4394 | 4820 |
| cT | ||||||
| 0 | 0 | 0 | 0 | 0 | 1 (0.0) | 1 (0.0) |
| 1 | 0 | 4 (5.6) | 3 (1.0) | 7 (1.8) | 117 (3.0) | 124 (2.9) |
| 2 | 1 (7.1) | 7 (9.9) | 42 (13.9) | 50 (12.9) | 622 (15.9) | 672 (15.6) |
| 3 | 11 (78.6) | 50 (70.4) | 228 (75.5) | 289 (74.7) | 2812 (71.9) | 3101 (72.2) |
| 4 | 2 (14.3) | 10 (14.1) | 29 (9.6) | 41 (10.6) | 357 (9.1) | 398 (9.3) |
| Subtotal | 14 | 71 | 302 | 387 | 3909 | 4296 |
| cN | ||||||
| 0 | 4 (30.8) | 23 (37.1) | 117 (42.9) | 144 (41.4) | 1873 (52.9) | 2017 (51.9) |
| 1 | 6 (46.2) | 34 (54.8) | 124 (45.4) | 164 (47.1) | 1409 (39.8) | 1573 (40.5) |
| 2 | 3 (23.1) | 5 (8.1) | 32 (11.7) | 40 (11.5) | 258 (7.3) | 298 (7.7) |
| Subtotal | 13 | 62 | 273 | 348 | 3540 | 3888 |
| Overall Clinical Stage | ||||||
| 0 | 0 | 0 | 0 | 0 | 6 (0.2) | 6 (0.2) |
| 1 | 0 | 2 (3.2) | 3 (1.1) | 5 (1.5) | 91 (2.7) | 96 (2.6) |
| 2 | 4 (30.8) | 21 (33.3) | 108 (40.4) | 133 (38.8) | 1627 (48.0) | 1760 (47.2) |
| 3 | 9 (69.2) | 40 (63.5) | 156 (58.4) | 205 (59.8) | 1664 (49.1) | 1869 (50.1) |
| Subtotal | 13 | 63 | 267 | 343 | 3388 | 3731 |
| pT | ||||||
| 0 | 1 (6.7) | 3 (3.1) | 32 (8.4) | 36 (7.3) | 253 (5.0) | 289 (5.2) |
| 1 | 0 | 15 (15.3) | 38 (9.9) | 53 (10.7) | 526 (10.3) | 579 (10.3) |
| 2 | 1 (6.7) | 22 (22.4) | 92 (24.0) | 115 (23.2) | 1449 (28.4) | 1564 (27.9) |
| 3 | 11 (73.3) | 51 (52.0) | 193 (50.4) | 255 (51.4) | 2628 (51.4) | 2883 (51.4) |
| 4 | 2 (13.3) | 7 (7.1) | 28 (7.3) | 37 (7.5) | 254 (5.0) | 291 (5.2) |
| Subtotal | 15 | 98 | 383 | 496 | 5110 | 5606 |
| pN | ||||||
| 0 | 4 (26.7) | 48 (49.5) | 200 (54.2) | 252 (52.4) | 2837 (58.4) | 3089 (57.8) |
| 1 | 9 (60.0) | 32 (33.0) | 108 (29.3) | 149 (31.0) | 1368 (28.1) | 1517 (28.4) |
| 2 | 2 (13.3) | 17 (17.5) | 61 (16.5) | 80 (16.6) | 656 (13.5) | 736 (13.8) |
| Subtotal | 15 | 97 | 369 | 481 | 4861 | 5342 |
| Overall Pathological Stage | ||||||
| 0 | 1 (6.7) | 3 (3.2) | 26 (7.1) | 30 (6.3) | 223 (4.6) | 253 (4.8) |
| 1 | 0 | 8 (8.4) | 19 (5.2) | 27 (5.7) | 276 (5.7) | 303 (5.7) |
| 2 | 3 (20.0) | 44 (46.3) | 183 (49.9) | 230 (48.2) | 2572 (53.5) | 2802 (53.0) |
| 3 | 11 (73.3) | 40 (42.1) | 139 (37.9) | 190 (39.8) | 1738 (36.1) | 1928 (36.5) |
| Subtotal | 15 | 95 | 367 | 477 | 4809 | 5286 |
summary_pre_op_rt <- function(df, rt_course_filter, group_var) {
rt_filter <- case_when(rt_course_filter == "short" ~ "Short-course RT",
rt_course_filter == "long" ~ "Long-course chemoRT",
rt_course_filter == "all" ~ c("Short-course RT","Long-course chemoRT"))
var <- deparse(substitute(group_var))
Reduce(function(x, y) merge(x, y, by = var, all = TRUE),
list(df %>%
filter(rt_course %in% rt_filter,
M1_at_Dx == 0,
neoadjuvant == 1) %>%
group_by(age_group2, {{group_var}}, .drop = FALSE) %>%
summarize(sub_total = n()) %>%
pivot_wider(names_from = age_group2, values_from = sub_total) %>%
mutate_at(vars(-var), ~replace(., is.na(.), 0)) %>%
select(-`>=50`),
df %>%
filter(rt_course %in% rt_filter,
M1_at_Dx == 0,
neoadjuvant == 1) %>%
group_by(age_group1, {{group_var}}, .drop = FALSE) %>%
summarize(sub_total = n()) %>%
pivot_wider(names_from = age_group1, values_from = sub_total) %>%
mutate_at(vars(-var), ~replace(., is.na(.), 0)),
df %>%
filter(rt_course %in% rt_filter,
M1_at_Dx == 0,
neoadjuvant == 1) %>%
group_by({{group_var}}, .drop = FALSE) %>%
summarize(`Grand Total` = n()) %>%
mutate_at(vars(-var), ~replace(., is.na(.), 0)))
) %>%
arrange({{group_var}}) %>%
mutate({{group_var}} := as.character({{group_var}})) %>%
filter(!is.na({{group_var}})) %>%
bind_rows(summarize(., across(where(is.numeric), sum),
across(where(is.character), ~"Subtotal"))) %>%
rename(Parameter = {{group_var}}) %>%
# add percentage
mutate_at(vars(-Parameter),
~if_else((Parameter != "Subtotal") & (.x != 0),
paste0(.x, ' (', sprintf("%.1f", round(.x/.x[Parameter == "Subtotal"]*100, 1)), ')'),
# paste0(.x, ' (', sprintf("%.1f", round(.x/sum(.x)*100, 1)), ')'),
as.character(.x)))
}
| All RT course | <=29 | 30-39 | 40-49 | <50 | >=50 | Grand Total |
|---|---|---|---|---|---|---|
| Neoadjuvant Therapy | ||||||
| Short-course RT | 1 (25.0) | 13 (26.5) | 90 (45.9) | 104 (41.8) | 1439 (61.2) | 1543 (59.3) |
| Long-course chemoRT | 3 (75.0) | 36 (73.5) | 106 (54.1) | 145 (58.2) | 913 (38.8) | 1058 (40.7) |
| Subtotal | 4 | 49 | 196 | 249 | 2352 | 2601 |
| cT Stage | ||||||
| 1 | 0 | 0 | 0 | 0 | 9 (0.4) | 9 (0.4) |
| 2 | 1 (25.0) | 7 (14.9) | 24 (12.6) | 32 (13.2) | 295 (13.4) | 327 (13.4) |
| 3 | 1 (25.0) | 36 (76.6) | 161 (84.3) | 198 (81.8) | 1783 (80.9) | 1981 (81.0) |
| 4 | 2 (50.0) | 4 (8.5) | 6 (3.1) | 12 (5.0) | 117 (5.3) | 129 (5.3) |
| Subtotal | 4 | 47 | 191 | 242 | 2204 | 2446 |
| cN Stage | ||||||
| 0 | 2 (50.0) | 13 (30.2) | 66 (38.8) | 81 (37.3) | 945 (48.4) | 1026 (47.3) |
| 1 | 1 (25.0) | 25 (58.1) | 83 (48.8) | 109 (50.2) | 869 (44.5) | 978 (45.1) |
| 2 | 1 (25.0) | 5 (11.6) | 21 (12.4) | 27 (12.4) | 138 (7.1) | 165 (7.6) |
| Subtotal | 4 | 43 | 170 | 217 | 1952 | 2169 |
| Overall Clinical Stage | ||||||
| 1 | 0 | 0 | 0 | 0 | 7 (0.4) | 7 (0.3) |
| 2 | 2 (50.0) | 15 (34.1) | 73 (42.9) | 90 (41.3) | 965 (50.1) | 1055 (49.2) |
| 3 | 2 (50.0) | 29 (65.9) | 97 (57.1) | 128 (58.7) | 953 (49.5) | 1081 (50.4) |
| Subtotal | 4 | 44 | 170 | 218 | 1925 | 2143 |
| ypT Stage | ||||||
| 0 | 1 (25.0) | 3 (6.2) | 22 (11.4) | 26 (10.6) | 165 (7.1) | 191 (7.4) |
| 1 | 0 | 6 (12.5) | 10 (5.2) | 16 (6.5) | 127 (5.5) | 143 (5.6) |
| 2 | 0 | 8 (16.7) | 52 (26.9) | 60 (24.5) | 685 (29.4) | 745 (29.0) |
| 3 | 3 (75.0) | 28 (58.3) | 101 (52.3) | 132 (53.9) | 1263 (54.3) | 1395 (54.2) |
| 4 | 0 | 3 (6.2) | 8 (4.1) | 11 (4.5) | 87 (3.7) | 98 (3.8) |
| Subtotal | 4 | 48 | 193 | 245 | 2327 | 2572 |
| ypN Stage | ||||||
| 0 | 1 (25.0) | 26 (53.1) | 103 (53.4) | 130 (52.8) | 1451 (62.0) | 1581 (61.2) |
| 1 | 3 (75.0) | 13 (26.5) | 49 (25.4) | 65 (26.4) | 619 (26.5) | 684 (26.5) |
| 2 | 0 | 10 (20.4) | 41 (21.2) | 51 (20.7) | 269 (11.5) | 320 (12.4) |
| Subtotal | 4 | 49 | 193 | 246 | 2339 | 2585 |
| Overall Pathological Stage | ||||||
| 0 | 1 (25.0) | 3 (6.2) | 16 (8.3) | 20 (8.2) | 147 (6.3) | 167 (6.5) |
| 1 | 0 | 4 (8.3) | 8 (4.2) | 12 (4.9) | 110 (4.7) | 122 (4.8) |
| 2 | 0 | 19 (39.6) | 99 (51.6) | 118 (48.4) | 1304 (56.2) | 1422 (55.5) |
| 3 | 3 (75.0) | 22 (45.8) | 69 (35.9) | 94 (38.5) | 759 (32.7) | 853 (33.3) |
| Subtotal | 4 | 48 | 192 | 244 | 2320 | 2564 |
| T Stage Change | ||||||
| Downstage | 3 (75.0) | 18 (39.1) | 76 (40.2) | 97 (40.6) | 836 (38.3) | 933 (38.6) |
| No change | 0 | 23 (50.0) | 101 (53.4) | 124 (51.9) | 1178 (54.0) | 1302 (53.8) |
| Upstage | 1 (25.0) | 5 (10.9) | 12 (6.3) | 18 (7.5) | 166 (7.6) | 184 (7.6) |
| Subtotal | 4 | 46 | 189 | 239 | 2180 | 2419 |
| N Stage Change | ||||||
| Downstage | 1 (25.0) | 16 (37.2) | 47 (27.8) | 64 (29.6) | 575 (29.6) | 639 (29.6) |
| No change | 2 (50.0) | 18 (41.9) | 80 (47.3) | 100 (46.3) | 974 (50.2) | 1074 (49.8) |
| Upstage | 1 (25.0) | 9 (20.9) | 42 (24.9) | 52 (24.1) | 391 (20.2) | 443 (20.5) |
| Subtotal | 4 | 43 | 169 | 216 | 1940 | 2156 |
| Overal Stage Change | ||||||
| Downstage | 1 (25.0) | 17 (39.5) | 68 (40.5) | 86 (40.0) | 662 (34.8) | 748 (35.4) |
| No change | 2 (50.0) | 21 (48.8) | 81 (48.2) | 104 (48.4) | 1011 (53.2) | 1115 (52.7) |
| Upstage | 1 (25.0) | 5 (11.6) | 19 (11.3) | 25 (11.6) | 227 (11.9) | 252 (11.9) |
| Subtotal | 4 | 43 | 168 | 215 | 1900 | 2115 |
| Total Number of Nodes | ||||||
| <12 | 0 | 10 (20.4) | 70 (36.1) | 80 (32.4) | 1000 (43.0) | 1080 (42.0) |
| >=12 | 4 (100.0) | 39 (79.6) | 124 (63.9) | 167 (67.6) | 1325 (57.0) | 1492 (58.0) |
| Subtotal | 4 | 49 | 194 | 247 | 2325 | 2572 |
| Total Number of Positive Nodes | ||||||
| 0 | 1 (25.0) | 27 (55.1) | 106 (54.9) | 134 (54.5) | 1484 (64.1) | 1618 (63.2) |
| 1-2 | 2 (50.0) | 8 (16.3) | 39 (20.2) | 49 (19.9) | 476 (20.6) | 525 (20.5) |
| 3-4 | 1 (25.0) | 4 (8.2) | 20 (10.4) | 25 (10.2) | 153 (6.6) | 178 (7.0) |
| 5-9 | 0 | 3 (6.1) | 23 (11.9) | 26 (10.6) | 142 (6.1) | 168 (6.6) |
| 10+ | 0 | 7 (14.3) | 5 (2.6) | 12 (4.9) | 60 (2.6) | 72 (2.8) |
| Subtotal | 4 | 49 | 193 | 246 | 2315 | 2561 |
| pCR Status | ||||||
| No | 3 (75.0) | 45 (93.8) | 176 (91.7) | 224 (91.8) | 2170 (93.7) | 2394 (93.5) |
| Yes | 1 (25.0) | 3 (6.2) | 16 (8.3) | 20 (8.2) | 147 (6.3) | 167 (6.5) |
| Subtotal | 4 | 48 | 192 | 244 | 2317 | 2561 |
| Short-course RT | <=29 | 30-39 | 40-49 | <50 | >=50 | Grand Total |
|---|---|---|---|---|---|---|
| cT Stage | ||||||
| 1 | 0 | 0 | 0 | 0 | 8 (0.6) | 8 (0.6) |
| 2 | 1 (100.0) | 3 (27.3) | 18 (20.7) | 22 (22.2) | 240 (18.2) | 262 (18.5) |
| 3 | 0 | 7 (63.6) | 69 (79.3) | 76 (76.8) | 1063 (80.7) | 1139 (80.4) |
| 4 | 0 | 1 (9.1) | 0 | 1 (1.0) | 6 (0.5) | 7 (0.5) |
| Subtotal | 1 | 11 | 87 | 99 | 1317 | 1416 |
| cN Stage | ||||||
| 0 | 1 (100.0) | 4 (44.4) | 33 (46.5) | 38 (46.9) | 690 (61.3) | 728 (60.3) |
| 1 | 0 | 5 (55.6) | 33 (46.5) | 38 (46.9) | 399 (35.4) | 437 (36.2) |
| 2 | 0 | 0 | 5 (7.0) | 5 (6.2) | 37 (3.3) | 42 (3.5) |
| Subtotal | 1 | 9 | 71 | 81 | 1126 | 1207 |
| Overall Clinical Stage | ||||||
| 1 | 0 | 0 | 0 | 0 | 7 (0.6) | 7 (0.6) |
| 2 | 1 (100.0) | 4 (44.4) | 40 (56.3) | 45 (55.6) | 718 (65.3) | 763 (64.7) |
| 3 | 0 | 5 (55.6) | 31 (43.7) | 36 (44.4) | 374 (34.0) | 410 (34.7) |
| Subtotal | 1 | 9 | 71 | 81 | 1099 | 1180 |
| ypT Stage | ||||||
| 0 | 0 | 0 | 2 (2.2) | 2 (1.9) | 27 (1.9) | 29 (1.9) |
| 1 | 0 | 2 (15.4) | 7 (7.8) | 9 (8.7) | 73 (5.1) | 82 (5.3) |
| 2 | 0 | 2 (15.4) | 28 (31.1) | 30 (28.8) | 460 (32.0) | 490 (31.8) |
| 3 | 1 (100.0) | 9 (69.2) | 48 (53.3) | 58 (55.8) | 841 (58.6) | 899 (58.4) |
| 4 | 0 | 0 | 5 (5.6) | 5 (4.8) | 35 (2.4) | 40 (2.6) |
| Subtotal | 1 | 13 | 90 | 104 | 1436 | 1540 |
| ypN Stage | ||||||
| 0 | 0 | 5 (38.5) | 44 (49.4) | 49 (47.6) | 869 (60.4) | 918 (59.6) |
| 1 | 1 (100.0) | 4 (30.8) | 19 (21.3) | 24 (23.3) | 370 (25.7) | 394 (25.6) |
| 2 | 0 | 4 (30.8) | 26 (29.2) | 30 (29.1) | 199 (13.8) | 229 (14.9) |
| Subtotal | 1 | 13 | 89 | 103 | 1438 | 1541 |
| Overall Pathological Stage | ||||||
| 0 | 0 | 0 | 2 (2.2) | 2 (1.9) | 24 (1.7) | 26 (1.7) |
| 1 | 0 | 1 (7.7) | 5 (5.6) | 6 (5.8) | 65 (4.5) | 71 (4.6) |
| 2 | 0 | 5 (38.5) | 45 (50.6) | 50 (48.5) | 867 (60.4) | 917 (59.6) |
| 3 | 1 (100.0) | 7 (53.8) | 37 (41.6) | 45 (43.7) | 479 (33.4) | 524 (34.1) |
| Subtotal | 1 | 13 | 89 | 103 | 1435 | 1538 |
| T Stage Change | ||||||
| Downstage | 0 | 4 (36.4) | 28 (32.2) | 32 (32.3) | 398 (30.3) | 430 (30.4) |
| No change | 0 | 5 (45.5) | 49 (56.3) | 54 (54.5) | 778 (59.2) | 832 (58.9) |
| Upstage | 1 (100.0) | 2 (18.2) | 10 (11.5) | 13 (13.1) | 138 (10.5) | 151 (10.7) |
| Subtotal | 1 | 11 | 87 | 99 | 1314 | 1413 |
| N Stage Change | ||||||
| Downstage | 0 | 2 (22.2) | 13 (18.3) | 15 (18.5) | 210 (18.7) | 225 (18.7) |
| No change | 0 | 4 (44.4) | 32 (45.1) | 36 (44.4) | 628 (55.8) | 664 (55.1) |
| Upstage | 1 (100.0) | 3 (33.3) | 26 (36.6) | 30 (37.0) | 287 (25.5) | 317 (26.3) |
| Subtotal | 1 | 9 | 71 | 81 | 1125 | 1206 |
| Overal Stage Change | ||||||
| Downstage | 0 | 4 (44.4) | 19 (26.8) | 23 (28.4) | 251 (22.9) | 274 (23.3) |
| No change | 0 | 3 (33.3) | 38 (53.5) | 41 (50.6) | 654 (59.7) | 695 (59.1) |
| Upstage | 1 (100.0) | 2 (22.2) | 14 (19.7) | 17 (21.0) | 190 (17.4) | 207 (17.6) |
| Subtotal | 1 | 9 | 71 | 81 | 1095 | 1176 |
| Total Number of Nodes | ||||||
| <12 | 0 | 3 (23.1) | 35 (39.3) | 38 (36.9) | 599 (42.3) | 637 (41.9) |
| >=12 | 1 (100.0) | 10 (76.9) | 54 (60.7) | 65 (63.1) | 818 (57.7) | 883 (58.1) |
| Subtotal | 1 | 13 | 89 | 103 | 1417 | 1520 |
| Total Number of Positive Nodes | ||||||
| 0 | 0 | 5 (38.5) | 44 (50.0) | 49 (48.0) | 883 (61.7) | 932 (60.8) |
| 1-2 | 1 (100.0) | 2 (15.4) | 14 (15.9) | 17 (16.7) | 292 (20.4) | 309 (20.2) |
| 3-4 | 0 | 2 (15.4) | 11 (12.5) | 13 (12.7) | 102 (7.1) | 115 (7.5) |
| 5-9 | 0 | 1 (7.7) | 16 (18.2) | 17 (16.7) | 103 (7.2) | 120 (7.8) |
| 10+ | 0 | 3 (23.1) | 3 (3.4) | 6 (5.9) | 50 (3.5) | 56 (3.7) |
| Subtotal | 1 | 13 | 88 | 102 | 1430 | 1532 |
| pCR Status | ||||||
| No | 1 (100.0) | 13 (100.0) | 87 (97.8) | 101 (98.1) | 1411 (98.3) | 1512 (98.3) |
| Yes | 0 | 0 | 2 (2.2) | 2 (1.9) | 24 (1.7) | 26 (1.7) |
| Subtotal | 1 | 13 | 89 | 103 | 1435 | 1538 |
| Long-course chemoRT | <=29 | 30-39 | 40-49 | <50 | >=50 | Grand Total |
|---|---|---|---|---|---|---|
| cT Stage | ||||||
| 1 | 0 | 0 | 0 | 0 | 1 (0.1) | 1 (0.1) |
| 2 | 0 | 4 (11.1) | 6 (5.8) | 10 (7.0) | 55 (6.2) | 65 (6.3) |
| 3 | 1 (33.3) | 29 (80.6) | 92 (88.5) | 122 (85.3) | 720 (81.2) | 842 (81.7) |
| 4 | 2 (66.7) | 3 (8.3) | 6 (5.8) | 11 (7.7) | 111 (12.5) | 122 (11.8) |
| Subtotal | 3 | 36 | 104 | 143 | 887 | 1030 |
| cN Stage | ||||||
| 0 | 1 (33.3) | 9 (26.5) | 33 (33.3) | 43 (31.6) | 255 (30.9) | 298 (31.0) |
| 1 | 1 (33.3) | 20 (58.8) | 50 (50.5) | 71 (52.2) | 470 (56.9) | 541 (56.2) |
| 2 | 1 (33.3) | 5 (14.7) | 16 (16.2) | 22 (16.2) | 101 (12.2) | 123 (12.8) |
| Subtotal | 3 | 34 | 99 | 136 | 826 | 962 |
| Overall Clinical Stage | ||||||
| 2 | 1 (33.3) | 11 (31.4) | 33 (33.3) | 45 (32.8) | 247 (29.9) | 292 (30.3) |
| 3 | 2 (66.7) | 24 (68.6) | 66 (66.7) | 92 (67.2) | 579 (70.1) | 671 (69.7) |
| Subtotal | 3 | 35 | 99 | 137 | 826 | 963 |
| ypT Stage | ||||||
| 0 | 1 (33.3) | 3 (8.6) | 20 (19.4) | 24 (17.0) | 138 (15.5) | 162 (15.7) |
| 1 | 0 | 4 (11.4) | 3 (2.9) | 7 (5.0) | 54 (6.1) | 61 (5.9) |
| 2 | 0 | 6 (17.1) | 24 (23.3) | 30 (21.3) | 225 (25.3) | 255 (24.7) |
| 3 | 2 (66.7) | 19 (54.3) | 53 (51.5) | 74 (52.5) | 422 (47.4) | 496 (48.1) |
| 4 | 0 | 3 (8.6) | 3 (2.9) | 6 (4.3) | 52 (5.8) | 58 (5.6) |
| Subtotal | 3 | 35 | 103 | 141 | 891 | 1032 |
| ypN Stage | ||||||
| 0 | 1 (33.3) | 21 (58.3) | 59 (56.7) | 81 (56.6) | 582 (64.6) | 663 (63.5) |
| 1 | 2 (66.7) | 9 (25.0) | 30 (28.8) | 41 (28.7) | 249 (27.6) | 290 (27.8) |
| 2 | 0 | 6 (16.7) | 15 (14.4) | 21 (14.7) | 70 (7.8) | 91 (8.7) |
| Subtotal | 3 | 36 | 104 | 143 | 901 | 1044 |
| Overall Pathological Stage | ||||||
| 0 | 1 (33.3) | 3 (8.6) | 14 (13.6) | 18 (12.8) | 123 (13.9) | 141 (13.7) |
| 1 | 0 | 3 (8.6) | 3 (2.9) | 6 (4.3) | 45 (5.1) | 51 (5.0) |
| 2 | 0 | 14 (40.0) | 54 (52.4) | 68 (48.2) | 437 (49.4) | 505 (49.2) |
| 3 | 2 (66.7) | 15 (42.9) | 32 (31.1) | 49 (34.8) | 280 (31.6) | 329 (32.1) |
| Subtotal | 3 | 35 | 103 | 141 | 885 | 1026 |
| T Stage Change | ||||||
| Downstage | 3 (100.0) | 14 (40.0) | 48 (47.1) | 65 (46.4) | 438 (50.6) | 503 (50.0) |
| No change | 0 | 18 (51.4) | 52 (51.0) | 70 (50.0) | 400 (46.2) | 470 (46.7) |
| Upstage | 0 | 3 (8.6) | 2 (2.0) | 5 (3.6) | 28 (3.2) | 33 (3.3) |
| Subtotal | 3 | 35 | 102 | 140 | 866 | 1006 |
| N Stage Change | ||||||
| Downstage | 1 (33.3) | 14 (41.2) | 34 (34.7) | 49 (36.3) | 365 (44.8) | 414 (43.6) |
| No change | 2 (66.7) | 14 (41.2) | 48 (49.0) | 64 (47.4) | 346 (42.5) | 410 (43.2) |
| Upstage | 0 | 6 (17.6) | 16 (16.3) | 22 (16.3) | 104 (12.8) | 126 (13.3) |
| Subtotal | 3 | 34 | 98 | 135 | 815 | 950 |
| Overal Stage Change | ||||||
| Downstage | 1 (33.3) | 13 (38.2) | 49 (50.5) | 63 (47.0) | 411 (51.1) | 474 (50.5) |
| No change | 2 (66.7) | 18 (52.9) | 43 (44.3) | 63 (47.0) | 357 (44.3) | 420 (44.7) |
| Upstage | 0 | 3 (8.8) | 5 (5.2) | 8 (6.0) | 37 (4.6) | 45 (4.8) |
| Subtotal | 3 | 34 | 97 | 134 | 805 | 939 |
| Total Number of Nodes | ||||||
| <12 | 0 | 7 (19.4) | 35 (33.3) | 42 (29.2) | 401 (44.2) | 443 (42.1) |
| >=12 | 3 (100.0) | 29 (80.6) | 70 (66.7) | 102 (70.8) | 507 (55.8) | 609 (57.9) |
| Subtotal | 3 | 36 | 105 | 144 | 908 | 1052 |
| Total Number of Positive Nodes | ||||||
| 0 | 1 (33.3) | 22 (61.1) | 62 (59.0) | 85 (59.0) | 601 (67.9) | 686 (66.7) |
| 1-2 | 1 (33.3) | 6 (16.7) | 25 (23.8) | 32 (22.2) | 184 (20.8) | 216 (21.0) |
| 3-4 | 1 (33.3) | 2 (5.6) | 9 (8.6) | 12 (8.3) | 51 (5.8) | 63 (6.1) |
| 5-9 | 0 | 2 (5.6) | 7 (6.7) | 9 (6.2) | 39 (4.4) | 48 (4.7) |
| 10+ | 0 | 4 (11.1) | 2 (1.9) | 6 (4.2) | 10 (1.1) | 16 (1.6) |
| Subtotal | 3 | 36 | 105 | 144 | 885 | 1029 |
| pCR Status | ||||||
| No | 2 (66.7) | 32 (91.4) | 89 (86.4) | 123 (87.2) | 759 (86.1) | 882 (86.2) |
| Yes | 1 (33.3) | 3 (8.6) | 14 (13.6) | 18 (12.8) | 123 (13.9) | 141 (13.8) |
| Subtotal | 3 | 35 | 103 | 141 | 882 | 1023 |
figure1_data <- data_filtered %>%
filter(M1_at_Dx == 0) %>%
select(cT, cN, overall_clin_stg, pT, pN, overall_path_stg, age_group1, age_group2) %>%
rename(`Overall Clinical` = overall_clin_stg,
`Overall Pathological` = overall_path_stg) %>%
mutate_all(as.factor)
figure1_color <- c("#e0f3f8", "#74add1", "#fdae61", "#f46d43", "#a50026")
figure1_panelA_plot <- function(df, var_name) {
df %>%
filter(!is.na({{var_name}})) %>%
group_by({{var_name}}, age_group2, .drop = FALSE) %>%
summarize(n = n()) %>%
group_by(age_group2) %>%
mutate(n_total = sum(n),
percent = round(n / n_total *100, 2)) %>%
ungroup() %>%
bind_cols(binom.confint(.$n, .$n_total, conf.level = 0.95, methods = "exact")[c("lower", "upper")]*100) %>%
ggplot(aes(age_group2, percent, group = {{var_name}})) +
geom_point(aes(color = {{var_name}}), size = 2.5) +
geom_line(aes(color = {{var_name}}), size = 1) +
geom_ribbon(aes(ymin = lower, ymax = upper, fill = {{var_name}}), alpha = 0.1) +
scale_color_manual(values = figure1_color,
name = str_wrap(paste0(deparse(substitute(var_name)), " Stage"), 10)) +
scale_fill_manual(values = figure1_color,
name = str_wrap(paste0(deparse(substitute(var_name)), " Stage"), 10)) +
scale_x_discrete(labels = c("<=29" = "\u226429",
">=50" = "\u226550")) +
labs(x = "Age Group",
y = "Percentage (%)",
title = str_wrap(paste0("Distribution of ", deparse(substitute(var_name)), " Stages"), 45)) +
ylim(0, 100) +
theme_light(base_size = 18) +
theme(axis.text = element_text(size = 18))
}
figure1_panelB_plot_side <- function(df, var_name) {
df %>%
filter(!is.na({{var_name}})) %>%
group_by({{var_name}}, age_group1) %>%
summarize(n = n()) %>%
group_by(age_group1) %>%
mutate(n_total = sum(n),
percent = round(n / n_total *100, 2)) %>%
ggplot(aes(age_group1, percent, fill = {{var_name}})) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = paste0(sprintf("%.1f", percent), "%\n(", n, ")")),
position = position_dodge(width = 0.9), vjust = -0.25,
color = "black", size = 3.5) +
scale_fill_manual(values = figure1_color,
name = str_wrap(paste0(deparse(substitute(var_name)), " Stage"), 10)) +
scale_x_discrete(labels = c(">=50" = "\u226550")) +
labs(x = "Age Group",
y = "Percentage (%)",
title = str_wrap(paste0("Distribution of ", deparse(substitute(var_name)), " Stages"), 45)) +
theme_light(base_size = 18) +
theme(axis.text = element_text(size = 18)) +
ylim(0, 100)
}
figure1_panelB_plot_stack <- function(df, var_name) {
df %>%
filter(!is.na({{var_name}})) %>%
group_by({{var_name}}, age_group1) %>%
summarize(n = n()) %>%
group_by(age_group1) %>%
mutate(n_total = sum(n),
percent = round(n / n_total *100, 2),
y_pos_left = if_else(age_group1 == "<50", 100 - (cumsum(percent) - percent/2), NA_real_),
y_pos_right = if_else(age_group1 == ">=50", 100 - (cumsum(percent) - percent/2), NA_real_)) %>%
ggplot(aes(age_group1, percent, fill = {{var_name}})) +
geom_bar(stat = "identity", position = "stack", width = 0.3) +
geom_label_repel(aes(x = 0.85, y = y_pos_left,
label = paste0(sprintf("%.1f", percent), "%\n(", n, ")")),
nudge_x = -0.4, direction = "y", box.padding = 0.5,
color = "black", alpha = 0.8, size = 3.5, fontface = "bold",
show.legend = FALSE) +
geom_label_repel(aes(x = 2.15, y = y_pos_right,
label = paste0(sprintf("%.1f", percent), "%\n(", n, ")")),
nudge_x = 0.4, direction = "y", box.padding = 0.5,
color = "black", alpha = 0.8, size = 3.5, fontface = "bold",
show.legend = FALSE) +
scale_fill_manual(values = figure1_color,
name = str_wrap(paste0(deparse(substitute(var_name)), " Stage"), 10)) +
scale_x_discrete(labels = c(">=50" = "\u226550")) +
labs(x = "Age Group",
y = "Percentage (%)",
title = str_wrap(paste0("Distribution of ", deparse(substitute(var_name)), " Stages"), 45)) +
theme_light(base_size = 18) +
theme(axis.text = element_text(size = 18))
}
figure2_data <- data_filtered %>%
filter(M1_at_Dx == 0,
neoadjuvant == 1) %>%
select(cT, cN, overall_clin_stg, pT, pN, overall_path_stg,
T_stg_change_status, N_stg_change_status, overall_stg_change_status,
rt_course, pCR, age_group1, age_group2) %>%
mutate(overall_stg_change_status = case_when(pCR == "Yes" ~ "Complete",
TRUE ~ as.character(overall_stg_change_status))) %>%
mutate_all(as.factor) %>%
rename(`Overall Clinical` = overall_clin_stg,
`Overall Pathological` = overall_path_stg)
figure2_panelA_color <- c("#abd9e9", "#fdae61")
figure2_panelA_plot <- function(df, age_group_var) {
df %>%
filter(!is.na(rt_course), rt_course != "Long-course RT") %>%
group_by(rt_course, {{age_group_var}}) %>%
summarize(n = n()) %>%
group_by({{age_group_var}}) %>%
mutate(n_total = sum(n),
percent = round(n / n_total * 100, 1)) %>%
ggplot(aes(x = factor(1), y = percent, fill = rt_course)) +
geom_bar(stat = "identity", color = "white") +
coord_polar("y", start = 0) +
facet_grid(cols = vars({{age_group_var}}), labeller = as_labeller(c("<50" = "Age group: <50",
">=50" = "Age group: \u226550",
"<=29" = "Age group: \u226429",
"30-39" = "30-39",
"40-49" = "40-49"))) +
geom_text(aes(label = paste0(n,"\n(", sprintf("%.1f", percent), "%)")),
position = position_stack(vjust = 0.5),
color = "black", size = 3.5) +
scale_fill_manual(values = figure2_panelA_color,
name = "Neoadjuvant Treatment") +
theme_void(base_size = 14) +
theme(strip.text = element_text(margin = margin(b = 5)))
}
figure2_panelB_color <- c("#313695", "#abd9e9", "#f46d43", "#a50026")
figure2_panelB_plot <- function(df, age_group_var) {
df %>%
filter(rt_course == "Long-course chemoRT") %>%
group_by({{age_group_var}}, `Overall Clinical`, `Overall Pathological`, overall_stg_change_status) %>%
drop_na() %>%
summarize(n = n()) %>%
group_by({{age_group_var}}) %>%
mutate(n_total = sum(n),
percent = round(n / n_total * 100, 1)) %>%
ggplot(aes(y = percent, axis1 = `Overall Clinical`, axis2 = `Overall Pathological`)) +
coord_cartesian(clip = "off") +
geom_alluvium(aes(fill = overall_stg_change_status),
width = 1/12, reverse =FALSE) +
geom_stratum(width = 0.1, reverse =FALSE) +
geom_label(stat = "stratum", aes(label = after_stat(stratum)),
size = 4, fontface = "bold",
label.size = NA, fill = NA, reverse =FALSE) +
facet_wrap(vars({{age_group_var}}), labeller = as_labeller(c("<50" = "Age group: <50",
">=50" = "Age group: \u226550")),
scales = "free") +
scale_x_discrete(limits = c("Overall\nClinical\nStage", "Overall\nPathological\nStage"),
expand = c(0.01, 0.01)) +
scale_fill_manual(values = figure2_panelB_color,
name = "Stage Change") +
labs(title = str_wrap(paste0("Overall Stage Change"), 45)) +
theme_void(base_size = 12) +
theme(axis.text.x = element_text(size = 12),
plot.margin = margin(rep(25,10)),
aspect.ratio = 1.2,
panel.spacing = unit(3, "lines"),
strip.text = element_text(margin = margin(b = 5)))
}
figure2_panelC_color <- c("#fdae61", "#abd9e9")
figure2_panelC_plot <- function(df, age_group_var) {
temp_df <- df %>%
filter(!is.na(pCR), rt_course == "Long-course chemoRT") %>%
group_by({{age_group_var}}, pCR) %>%
summarize(n = n()) %>%
group_by({{age_group_var}}) %>%
mutate(n_total = sum(n),
percent = round(n / n_total *100, 2)) %>%
ungroup() %>%
bind_cols(binom.confint(.$n, .$n_total, conf.level = 0.95, methods = "exact")[c("lower", "upper")]*100)
n_total_label <- temp_df %>%
select({{age_group_var}}, n_total) %>%
unique()
temp_df %>%
ggplot(aes(x = {{age_group_var}}, y = percent, fill = pCR)) +
geom_bar(stat = "identity", position = "dodge") +
geom_errorbar(aes(ymin = lower, ymax = upper), width = .2, position = position_dodge(0.9)) +
scale_fill_manual(values = figure2_panelC_color,
name = "pCR") +
labs(x = "Age Group",
y = "pCR Rate (%)",
title = str_wrap("pCR rate in different age groups", 45)) +
scale_x_discrete(labels = c("<=29" = paste0("\u226429 \n(n=", filter(n_total_label, {{age_group_var}} == "<=29")$n_total, ")"),
"30-39" = paste0("30-39 \n(n=", filter(n_total_label, {{age_group_var}} == "30-39")$n_total, ")"),
"40-49" = paste0("40-49 \n(n=", filter(n_total_label, {{age_group_var}} == "40-49")$n_total, ")"),
"<50" = paste0("<50 \n(n=", filter(n_total_label, {{age_group_var}} == "<50")$n_total, ")"),
">=50" = paste0("\u226550 \n(n=", filter(n_total_label, {{age_group_var}} == ">=50")$n_total, ")"))) +
ylim(0, 100) +
theme_light(base_size = 18) +
theme(axis.text = element_text(size = 18),
strip.text = element_text(margin = margin(b = 5)))
}
figure3_data <- data_filtered %>%
select(age_group1, age_group2, diagnosis_date, M1_at_Dx, neoadjuvant, rt_course,
survyrs, pat_status, surv_status, death_date,
colorectaldeath, colorectaldthdat, crc_death_status,
m1dtofdx, m1site, fst_relapse_date, fst_reg_dist_date,
LOCIND, LOCDATE, REGIND, REGDATE, DISTIND, DISTDATE,
subseq_colorectalca, subseq_CRC_dx_date,
recurr_status, dis_free_yrs, crc_recurr_status, crc_recurr_free_yrs)
figure3_plot_maxyear <- function(df, time_var, status_var, age_group_var, title_text, break_yr) {
fit <- eval(substitute(survfit(Surv(time_var, status_var) ~ age_group_var, data = df)))
figure3_color <- case_when(length(fit[["strata"]]) == 2 ~ list(c("#0571b0", "#ca0020")),
length(fit[["strata"]]) == 3 ~ list(c("#abd9e9", "#fdae61", "#f46d43")),
length(fit[["strata"]]) == 4 ~ list(c("#abd9e9", "#fdae61", "#f46d43", "#a50026")))[[1]]
legend_text <- if_else(deparse(substitute(age_group_var)) == "age_group1", list(c("<50", "\u226550")),
list(c("\u226429", "30-39", "40-49", "\u226550")))[[1]]
ggsurvplot(fit,
palette = figure3_color,
pval = TRUE,
conf.int = FALSE,
risk.table = TRUE,
tables.height = 0.3,
risk.table.fontsize = 3.5,
break.time.by = break_yr,
surv.median.line = "hv",
xlab = "Time (year)",
ylab = "Survival Probability",
title = str_wrap(title_text, 62),
legend.title = "Age Group",
legend.labs = legend_text[1:length(figure3_color)])
}
figure3_plot_month <- function(df, time_var, status_var, age_group_var, title_text, break_yr, max_yr) {
# df <- df %>%
# mutate({{status_var}} := case_when({{time_var}} > max_yr ~ 0,
# TRUE ~ {{status_var}}),
# {{time_var}} := case_when({{time_var}} > max_yr ~ max_yr,
# TRUE ~ {{time_var}}))
fit <- eval(substitute(survfit(Surv(time_var, status_var) ~ age_group_var, data = df)))
figure3_color <- case_when(length(fit[["strata"]]) == 2 ~ list(c("#0571b0", "#ca0020")),
length(fit[["strata"]]) == 3 ~ list(c("#abd9e9", "#fdae61", "#f46d43")),
length(fit[["strata"]]) == 4 ~ list(c("#abd9e9", "#fdae61", "#f46d43", "#a50026")))[[1]]
legend_text <- if_else(deparse(substitute(age_group_var)) == "age_group1", list(c("<50", "\u226550")),
list(c("\u226429", "30-39", "40-49", "\u226550")))[[1]]
ggsurvplot(fit,
palette = figure3_color,
xscale = "y_m",
pval = TRUE,
conf.int = FALSE,
risk.table = TRUE,
tables.height = 0.3,
risk.table.fontsize = 3.5,
break.time.by = break_yr,
surv.median.line = "hv",
xlab = "Time (month)",
ylab = "Survival Probability",
title = str_wrap(title_text, 62),
legend.title = "Age Group",
legend.labs = legend_text[1:length(figure3_color)],
xlim = c(0, max_yr))
}
figure4_data <- data_filtered %>%
filter(M1_at_Dx == 0,
rt_course != "Long-course RT" | is.na(rt_course),
!(rt_merged %in% c(5, 6, 77, 88, NA)),
!is.na(loc_merged),
!is.na(overall_path_stg)) %>%
droplevels() %>%
select(age_group1, age_group2, sex, loc_merged, overall_clin_stg,
overall_path_stg, neoadjuvant, rt_merged, rt_course,
survyrs, pat_status, surv_status, colorectaldeath, crc_death_status,
LOCIND, REGIND, DISTIND, subseq_colorectalca, recurr_status,
dis_free_yrs, crc_recurr_status) %>%
mutate(
across(c(overall_clin_stg, overall_path_stg), ~as.factor(.x)),
Age = factor(age_group1, labels = c("<50", "50+")),
rt_course = as.character(rt_course),
rt_course = if_else(rt_merged %in% c(0, 3, 4), "No", rt_course) %>%
factor(levels = c("Long-course chemoRT", "Short-course RT", "No"))) %>%
filter(!(neoadjuvant == 0 & rt_course %in% c("Long-course chemoRT", "Short-course RT"))) %>%
rename(Sex = sex,
`Tumour Location` = loc_merged,
`Overall Pathological Stage` = overall_path_stg,
`Neoadjuvant Therapy` = rt_course)
figure4_plot <- function(df, time_var, status_var, title_text) {
fit_ph <- eval(substitute(coxph(Surv(time_var, status_var) ~
Sex + Age + `Tumour Location` +
`Overall Pathological Stage` +
`Neoadjuvant Therapy`,
data = as.data.frame(df))))
ggforest(fit_ph,
main = title_text,
cpositions = c(0.005, 0.21, 0.4),
fontsize = 0.6,
refLabel = "Reference")
}